% Program using a maximum likelihood estimator for binary
% outcome model
% g(E(Y) | V) = beta' V
% for Y_i binary: 0 or 1
%
% The explanatory variables V (of which there are m) are either binary or
% continuous; think of V as a m-dimensional vector for each data point.
% Given n data points, the idea is to find the best estimate for the
% m-dimensional coefficient vector beta, using a maximum likelihood method
%
%
% This program takes in data as input, and performs statistics by varying
% the initial condition for beta
%
% Update on last year's file to take *selected* columns as the explanatory
% variables
% 
%
% The program then implements a Newton-Raphson scheme to compute
% the maximum likelihood beta, based on the modification of the weighting
% function from possessing singularities when the probabilities are close
% to 0 or 1, to the modified (SAMLE) version
%
% Sanjeeva Balasuriya
% May, 2024

% Added a process for selecting from the data -- to try to get datapoints
% which had probability of being close to 1 if y_i were 1, and close to
% 0 if y_i were 0.
% Having selected this data by running once, the program runs with only
% that data.
%
%  June, 2024



clear all; close all; clear figures
warning('off','all'); warning

% Setting up parameters and options
%
% Data file (first column in file is binary outcome variable)
%filename = 'myopia_edited.csv'; % 
filename = 'polypharm_missingremoved.csv'; % 
delimiterIn = ','; % delimiter in file
headerlinesIn = 1; % number of header lines
%covariates = [2 4 5 6 7 8 13 14 15] ; % use for myopia data
covariates = [2 3 4 5 6 8 9 10] ; % used for polypharm data % 
% column numbers of the independent variables read from file 
% NOTE: First column is always the dependent 0-1 outcome variable


%
% Parameters/options
rounding_calculations = 'no'; % whether to use rounding of digits of precision in Newton-Raphson iteration
round_digits = 12; % number of significant digits to keep if doing above
maxNR = 50; % maximum number of iterations for Newton-Raphson % 500
epsNR = 1e-2; % stop N-R when difference in betas is below this value (can't make this too small if using rounding!) 1e-2
method = "logit"; % (choose "atanh" or "log" or "logit"  or "linear" -- these are the link functions) % logit
n_realizations = 10000; % number of initial guesses to choose for beta % 10000
alpha1 = 0.3; % choose data such that G(beta^top v) \in [0,alpha1] whenever y_i = 0, % 0.3
alpha2 = 0.3; % choose data such that G(beta^top v) \in [1-alpha2,1] if y_i = 1 % 0.3
% NOTE: choose alpha1=alpha2=0.3 for getting a selection of data which
% illustrates the utility of SAMLE (for polypharm data), and choose alpha1=alpha2=1 to use the
% full data set instead
write_out = 'no'; % whether to write to file the chosen data


    % define functions
    [g,G,Gprime,Gprimeprime,G_generate,weight_function_g,weight_function_b,sup_norm] = ...
        functions_define(method);
    
    % read data
    A = importdata(filename,delimiterIn,headerlinesIn);
    y = A.data(:,1); % outcome is in first column
    n = length(y);
    m = length(covariates);
    v_unscaled = nan(n,m);
    for j = 1: m
     v_unscaled(:,j) = A.data(:,covariates(j)); % pick relevant columns
    end
    % identify any individuals with missing data and abort if there are any
    [rowNaN, colNaN] = find(isnan(v_unscaled));
    if(~isempty(rowNaN))
        return
    end
    n_new = length(y); % number of individuals changes is data is culled
    % scale each variable by maximum to ensure that v's are "comparable
    % in size" to improve chances of convergence, and of getting betas
    % which are also comparable in size
    vm_max = max(abs(v_unscaled),[],1);
    v = v_unscaled ./ vm_max;
    
    %
    % STEP 1: WORK ON FULL DATA   
    %

    % Do computations (good/SAMLE method) for different initial guesses for beta
    %
    n_iterations = zeros(n_realizations,1); % number of iterations in convergent situation
    beta_final = zeros(n_realizations,m);
    for realization = 1: n_realizations
        % Initialize
        rng shuffle  % so that a different initial beta is used each time
        beta_guess(realization,1:m)= randn(m,1);
        beta_old = beta_guess(realization,:);
        % Call Newton-Raphson algorithm with good/SAMLE weighting function
        if strcmp(rounding_calculations,'yes') % use rounding if asked to do so
            [beta_new,r] = newton_raphson_round(epsNR,maxNR,n_new,m,G,Gprime,Gprimeprime,...
            weight_function_g,sup_norm,beta_old,y,v,round_digits);
        else
            [beta_new,r] = newton_raphson(epsNR,maxNR,n_new,m,G,Gprime,Gprimeprime,...
            weight_function_g,sup_norm,beta_old,y,v);
        end
        % Record data
        n_iterations(realization) = r-1; % will be infinity if divergent
        beta_final(realization,:) = beta_new;     
    end
    % rescale beta so that it is the appropriate vector for the unscaled vs
    beta = beta_final ./ vm_max;
    beta_best = mode(beta((all((~isnan(beta)),2)),:)); 
                 %eliminate the NaNs, and choose the beta which occurs most
    


    
    %
    % STEP 2: USE THE BETA FROM THESE SAMLE COMPUTATIONS TO ELIMINATE SOME DATA
    %

    [y_new,v_unscaled_new,n_new,keep_index] = eliminate_data(v_unscaled,y,beta_best,G,...
             alpha1,alpha2,write_out);
    vm_max = max(abs(v_unscaled_new),[],1);
    v_new = v_unscaled_new ./ vm_max;



    % 
    % STEP 3: REDO CALCULATIONS FOR NEW DATA (BUT DO BOTH SAMLE AND
    % NONSAMLE NOW)
    %

% Do computations (good/SAMLE method) for different initial guesses for beta
    %
    n_iterations = zeros(n_realizations,1); % number of iterations in convergent situation
    beta_final = zeros(n_realizations,m);
    for realization = 1: n_realizations
        % Initialize
        rng shuffle  % so that a different initial beta is used each time
        beta_guess(realization,1:m)= randn(m,1);
        beta_old = beta_guess(realization,:);
        % Call Newton-Raphson algorithm with good/SAMLE weighting function
        if strcmp(rounding_calculations,'yes') % use rounding if asked to do so
            [beta_new,r] = newton_raphson_round(epsNR,maxNR,n_new,m,G,Gprime,Gprimeprime,...
            weight_function_g,sup_norm,beta_old,y_new,v_new,round_digits);
        else
            [beta_new,r] = newton_raphson(epsNR,maxNR,n_new,m,G,Gprime,Gprimeprime,...
            weight_function_g,sup_norm,beta_old,y_new,v_new);
        end
        % Record data
        n_iterations(realization) = r-1; % will be infinity if divergent
        beta_final(realization,:) = beta_new;     
    end
    % rescale beta so that it is the appropriate vector for the unscaled vs
    beta = beta_final ./ vm_max;
    %beta_best = mode(beta((all((~isnan(beta)),2)),:)); 
    % Now, since the betas are quite distributed, the mode isn't great --
    % instead choose the mean
    beta_best = nanmean(beta,1);




    % Do computations (bad/standard method) for different initial guesses for beta
    %
    n_iterations_b = zeros(n_realizations,1);
    beta_final_b = zeros(n_realizations,m);
    for realization = 1: n_realizations
        % Initialize
        rng shuffle  % so that a different initial beta is used each time
        beta_guess(realization,:)= randn(m,1);
        beta_old = beta_guess(realization,:);
        % Call Newton-Raphson algorithm with bad/standard weighting function
        if strcmp(rounding_calculations,'yes') % use rounding if asked to do so
            [beta_new,r] = newton_raphson_round(epsNR,maxNR,n_new,m,G,Gprime,Gprimeprime,...
            weight_function_g,sup_norm,beta_old,y_new,v_new,round_digits);
        else
            [beta_new,r] = newton_raphson(epsNR,maxNR,n_new,m,G,Gprime,Gprimeprime,...
            weight_function_b,sup_norm,beta_old,y_new,v_new);
        end
        % Record data
        n_iterations_b(realization) = r-1;
        beta_final_b(realization,:) = beta_new;     
    end
    % rescale beta so that it is the appropriate vector for the unscaled vs
    beta_b = beta_final_b ./ vm_max;
    %beta_best_b = mode(beta_b((all((~isnan(beta_b)),2)),:));
                 %eliminate the NaNs, and choose the beta which occurs most
    %Because the betas are quite distributed, better now to use mean
    beta_best_b = nanmean(beta_b,1);
   
 
    
% Obtain histograms for iterations for convergence, and plot them    
[iteration_number,convergent_fraction,convergent_fraction_b, ...
    divergent_fraction,divergent_fraction_b] ...
    = plot_histograms(n_iterations,n_iterations_b,n_realizations);
fraction_converging = 1 - divergent_fraction
fracton_converging_b = 1 - divergent_fraction_b
    

% Obtain histograms for expected values in the generalized linear model,
% and plot them
[y_expect,y_expect_b] = expectations(beta_best,beta_best_b,y_new,v_unscaled_new,G);

% Compute variation of the beta values in the convergent ones
[beta_vary, beta_b_vary] = beta_variation(divergent_fraction,...
          divergent_fraction_b,beta,beta_b,covariates,round_digits,...
          filename,rounding_calculations);






